# First things first , clean your R environment
rm(list = ls(all=TRUE))
# setwd("") # Set the working directory
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
dataFile = "boston.csv"
boston_housing_data = read.csv(dataFile)
attach(boston_housing_data) # by attaching the dataframe to the memory, we can directly use the variable names
Data description The Boston data frame has 506 rows and 14 columns.
This data frame contains the following columns:
CRIM : per capita crime rate by town.
ZN : proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS : proportion of non-retail business acres per town.
CHAS : Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
NOX : nitrogen oxides concentration (parts per 10 million).
RM : average number of rooms per dwelling.
AGE : proportion of owner-occupied units built prior to 1940.
DIS : weighted mean of distances to five Boston employment centres.
RAD : index of accessibility to radial highways.
TAX : full-value property-tax rate per $10,000.
PT : pupil-teacher ratio by town.
B : 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
LSTAT : lower status of the population (percent).
MV : median value of owner-occupied homes in $1000s.
str()summary()dim(boston_housing_data)
## [1] 506 14
colnames(boston_housing_data)
## [1] "CRIM" "ZN" "INDUS" "CHAS" "NOX" "RM" "AGE" "DIS"
## [9] "RAD" "TAX" "PT" "B" "LSTAT" "MV"
str()str(boston_housing_data)
## 'data.frame': 506 obs. of 14 variables:
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS: num 2.31 7.07 7.07 2.18 2.18 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 ...
## $ RM : num 6.57 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : int 296 242 242 222 222 222 311 311 311 311 ...
## $ PT : num 15.3 17.8 17.8 18.7 18.7 ...
## $ B : num 397 397 393 395 397 ...
## $ LSTAT: num 4.98 9.14 4.03 2.94 5.33 ...
## $ MV : num 24 21.6 34.7 33.4 36.2 ...
summary()summary(boston_housing_data)
## CRIM ZN INDUS CHAS
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## NOX RM AGE DIS
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## RAD TAX PT B
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## LSTAT MV
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
sum(is.na(boston_housing_data))
## [1] 0
boston_housing_data$CHAS_F = factor(boston_housing_data$CHAS)
#dummies::dummy(boston_housing_data$CHAS_F)
boston_housing_data$CHAS_F1 = dummies::dummy(boston_housing_data$CHAS_F)[,1]
boston_housing_data$CHAS_F = NULL
pairs(~CRIM+ZN+INDUS+NOX+RM+LSTAT,data=boston_housing_data,main="Scatterplot matrix with selected attributes")
pairs(~.,data=boston_housing_data,main="Scatterplot matrix")
correlation_XPairwise = cor(boston_housing_data[, !(colnames(boston_housing_data) %in% c("MV","CHAS_F1"))])
corrplot(correlation_XPairwise)
write.table(correlation_XPairwise,file="pairwiseCorrelations1.csv",row.names=FALSE,col.names=FALSE,sep=",")
lineFitSinglePredictor = lm(MV ~ LSTAT, data = boston_housing_data); # fit a simple linear regression model
summary(lineFitSinglePredictor)
##
## Call:
## lm(formula = MV ~ LSTAT, data = boston_housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## LSTAT -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
plotTitle = paste("Boston housing dataset : \n Median House Value vs %Lower Status Population\n Correlation coefficient = ",toString(cor(boston_housing_data$MV,boston_housing_data$LSTAT)),"\n MV = ",toString(lineFitSinglePredictor$coefficients[1]), "+ LSTAT*(",toString(lineFitSinglePredictor$coefficients[2]),")\n\n",sep=" ")
plot(boston_housing_data$LSTAT,boston_housing_data$MV,main=plotTitle,xlab="%Lower Status Population (LSTAT)",ylab="Median House Value (MV)")
abline(lineFitSinglePredictor,col="steelblue",lty=1,lwd=4)
summary(lineFitSinglePredictor)
##
## Call:
## lm(formula = MV ~ LSTAT, data = boston_housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## LSTAT -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
plot(lineFitSinglePredictor,main="Linear model with a single predictor")
cd = cooks.distance(lineFitSinglePredictor)
plot(cd,main=paste("boston_housing_data"," : \n Cook's distances for each sample for the Simple Linear Model",sep=" "),xlab="Index",ylab="Cook's distance")
grid(10,10,lwd=2)
lineFitMultiplePredictors = lm(boston_housing_data$MV ~ B+CRIM+RM+LSTAT)
summary(lineFitMultiplePredictors)
##
## Call:
## lm(formula = boston_housing_data$MV ~ B + CRIM + RM + LSTAT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.016 -3.494 -1.223 1.986 29.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.941106 3.498223 -2.270 0.023629 *
## B 0.010247 0.002968 3.452 0.000602 ***
## CRIM -0.074057 0.032766 -2.260 0.024237 *
## RM 5.389120 0.440138 12.244 < 2e-16 ***
## LSTAT -0.535983 0.048740 -10.997 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.431 on 501 degrees of freedom
## Multiple R-squared: 0.6541, Adjusted R-squared: 0.6513
## F-statistic: 236.8 on 4 and 501 DF, p-value: < 2.2e-16
plot(lineFitMultiplePredictors,main="Linear model with a multiple predictor")
lineFit_All = lm(MV ~ ., data = boston_housing_data)
summary(lineFit_All)
##
## Call:
## lm(formula = MV ~ ., data = boston_housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## CRIM -1.080e-01 3.286e-02 -3.287 0.001087 **
## ZN 4.642e-02 1.373e-02 3.382 0.000778 ***
## INDUS 2.056e-02 6.150e-02 0.334 0.738287
## CHAS 2.687e+00 8.616e-01 3.118 0.001925 **
## NOX -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## RM 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## AGE 6.922e-04 1.321e-02 0.052 0.958229
## DIS -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## RAD 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## TAX -1.233e-02 3.760e-03 -3.280 0.001112 **
## PT -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## B 9.312e-03 2.686e-03 3.467 0.000573 ***
## LSTAT -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## CHAS_F1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
lineFit_Interaction = lm(MV ~ LSTAT * AGE, data = boston_housing_data)
summary(lineFit_Interaction)
##
## Call:
## lm(formula = MV ~ LSTAT * AGE, data = boston_housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885364 1.4698355 24.553 < 2e-16 ***
## LSTAT -1.3921169 0.1674555 -8.313 8.78e-16 ***
## AGE -0.0007209 0.0198792 -0.036 0.9711
## LSTAT:AGE 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
plot(lineFit_Interaction)
lineFit_NonLinearTransform = lm(MV ~ CRIM + NOX + RM + DIS + PT + B + LSTAT + I(LSTAT ^ 2), data=boston_housing_data)
lineFit_NonLinearTransform_Categorical = lm(MV ~ CRIM + CHAS_F1 + NOX + RM + DIS + PT + B + LSTAT + I(LSTAT ^ 2), data=boston_housing_data)
summary(lineFit_NonLinearTransform_Categorical)
##
## Call:
## lm(formula = MV ~ CRIM + CHAS_F1 + NOX + RM + DIS + PT + B +
## LSTAT + I(LSTAT^2), data = boston_housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.753 -2.617 -0.333 1.750 26.172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.638373 4.530364 8.529 < 2e-16 ***
## CRIM -0.096228 0.027673 -3.477 0.000551 ***
## CHAS_F1 -2.828852 0.789583 -3.583 0.000374 ***
## NOX -11.399562 2.973082 -3.834 0.000142 ***
## RM 3.608807 0.372924 9.677 < 2e-16 ***
## DIS -1.175195 0.148926 -7.891 1.93e-14 ***
## PT -0.704267 0.104296 -6.753 4.07e-11 ***
## B 0.006909 0.002446 2.824 0.004928 **
## LSTAT -1.720591 0.119663 -14.379 < 2e-16 ***
## I(LSTAT^2) 0.034516 0.003207 10.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.394 on 496 degrees of freedom
## Multiple R-squared: 0.7758, Adjusted R-squared: 0.7717
## F-statistic: 190.7 on 9 and 496 DF, p-value: < 2.2e-16
plot(lineFit_NonLinearTransform,main="Linear model with multiple predictors and non-linear transformations")
set.seed(123) # for reproducible results
sample.size <- floor(0.75 * nrow(boston_housing_data))
train.index <- sample(seq_len(nrow(boston_housing_data)), size = sample.size)
train <- boston_housing_data[train.index,]
test <- boston_housing_data[- train.index,]
testPredictions_simple_linear_model = predict(lineFitSinglePredictor, test, interval = "confidence")
testPredictions_multiple_linear_model = predict(lineFit_All, test, interval = "confidence")
## Warning in predict.lm(lineFit_All, test, interval = "confidence"):
## prediction from a rank-deficient fit may be misleading
regr.eval(trues = test[,c("MV","LSTAT")],preds = testPredictions_simple_linear_model)
## mae mse rmse mape
## 1202.9424 20145.8429 141.9361 140.7144
regr.eval(trues = test,preds = testPredictions_multiple_linear_model)
## mae mse rmse mape
## 8105.694 2529351.586 1590.394 Inf